Project authors and participants:
Don't forget to upvote if you liked the post!
Each row contains the text of a tweet and a sentiment label. In the training set you are provided with a word or phrase drawn from the tweet (selected_text) that encapsulates the provided sentiment.
Premise:
Objective: predict the word or phrase from the tweet that exemplifies the provided sentiment
Load dependencies:
library(tidymodels) # tidy machine learning
library(readr) # read/write
library(dplyr) # manipulate data
library(tidyr) # tidy data
library(purrr) # functional programming
library(stringr) # text manipulation
library(qdapRegex) # easy regex
library(tm) # text mining
library(tidytext) # text mining
library(ggplot2) # elegant graphs
library(patchwork) # grid ggplot
library(doParallel) # parallel process
library(foreach) # parallel process
theme_set(theme_bw()) # set default ggplot2 theme
# Install external package if it is not already available.
# `requireNamespace()` is the idiomatic availability check; the original
# `require(textfeatures) == T` attached the package as a side effect and
# compared against the reassignable `T`.
if (!requireNamespace("textfeatures", quietly = TRUE)) {
  devtools::install("../input/r-textfeatures-package/textfeatures/")
}
library(textfeatures)
ncores <- 4 # worker/thread count used by the xgboost engine below
Function to evaluate model performance:
jaccard <- function(str1, str2) {
  # R port of the competition metric:
  # https://www.kaggle.com/c/tweet-sentiment-extraction/overview/evaluation
  # Word-level Jaccard similarity: |A ∩ B| / |A ∪ B|, case-insensitive.
  words_a <- unlist(strsplit(tolower(str1), split = " "))
  words_b <- unlist(strsplit(tolower(str2), split = " "))
  n_common <- length(intersect(words_a, words_b))
  n_common / (length(words_a) + length(words_b) - n_common)
}
Load available training and test data:
# Load training and test data; rename the target column for brevity.
# NOTE(review): sample_n(1000) subsamples the training set (presumably to keep
# the kernel fast) — remove it to train on the full data.
train_data <- read_csv("../data/train.csv") %>% rename(sel_text = selected_text) %>% sample_n(1000)
test_data <- read_csv("../data/test.csv")
Remove missing:
# Remove rows with missing or empty text.
# BUG FIX: the original condition `!is.na(text) | text == ""` *kept* empty
# strings (TRUE | TRUE) and only dropped NA rows via NA-propagation in the
# `|`; conjunction with `!=` expresses the stated intent.
train_data <- train_data %>% filter(!is.na(text), text != "")
Check Jaccard by sentiment using full text:
# Baseline: mean Jaccard per sentiment when the prediction is the full text.
train_data %>%
  mutate(jaccard = map2_dbl(text, sel_text, jaccard)) %>%
  group_by(sentiment) %>%
  summarise(jaccard = mean(jaccard))
FALSE # A tibble: 3 x 2
FALSE sentiment jaccard
FALSE <chr> <dbl>
FALSE 1 negative 0.327
FALSE 2 neutral 0.914
FALSE 3 positive 0.305
Note that the Jaccard score for the neutral sentiment is quite high when selecting the full text. So, let's set all the neutral texts aside before modeling.
# Hold out neutral tweets (they will simply be predicted as the full text);
# model only the positive/negative tweets.
# Train
train_neutral <- filter(train_data, sentiment == "neutral")
train_data <- filter(train_data, sentiment != "neutral")
# Test
test_neutral <- filter(test_data, sentiment == "neutral")
test_data <- filter(test_data, sentiment != "neutral")
Remove lines from the training data where sel_text is not contained in the text
# Flag training rows where no word of `text` occurs in `sel_text`:
# sel_text cannot be reconstructed from such rows, so they are dropped.
bad_text <-
  train_data %>%
  mutate(
    texts = map(text, ~ str_split(.x, " ")[[1]]),
    sel_texts = map(sel_text, ~ str_split(.x, " ")[[1]]),
    bad_text = map2_lgl(texts, sel_texts, ~ !any(.x %in% .y))
  ) %>%
  pull(bad_text)
train_data <- train_data[!bad_text, ]
25 lines have been removed.
We will collect all possible ngrams to train a regression model that estimates a jaccard for each piece.
Ngrams are like this:
code:
# Aux function: escape regex metacharacters in `x` so it can be used as a
# literal stringr pattern (each match is prefixed with a backslash).
# NOTE(review): `[[:punct:]]` already covers most of the listed alternatives
# and `\\:` appears twice — the extra alternatives are redundant but harmless.
to_search <- function(x){
str_replace_all(x, "([[:punct:]]|\\*|\\+|\\.{1,}|\\:|\\$|\\:|\\^|\\?|\\|)", "\\\\\\1")
}
# Train: enumerate every contiguous word n-gram (n = 1..n_words) of each
# tweet as a candidate selection. `tau::textcnt(method = "string")` counts
# word n-grams; only the names (the n-grams themselves) are kept.
train_ngrams <-
train_data %>%
mutate(n_words = map_dbl(text, ~str_split(.x, pattern = " ", )[[1]] %>% length())) %>%
mutate(ngram_text = map2(text, n_words, function(text, n_words){
map(1:n_words,
~ tau::textcnt(text, method = "string", split = " ", n = .x, tolower = FALSE) %>% names() %>% unlist()
) } )) %>%
mutate(ngram_text = map(ngram_text, unlist)) %>%
unnest(cols = c(ngram_text)) %>%
# flag the candidate that exactly equals the true selection
mutate(sel = ngram_text == sel_text) %>%
# dif_text = the tweet with the (escaped) ngram removed
mutate(dif_text = str_remove(text, to_search(ngram_text)))
# Test: same n-gram enumeration as for train, but there is no sel_text,
# so no `sel` flag is computed.
test_ngrams <-
test_data %>%
mutate(n_words = map_dbl(text, ~str_split(.x, pattern = " ", )[[1]] %>% length())) %>%
mutate(ngram_text = map2(text, n_words, function(text, n_words){
map(1:n_words,
~ tau::textcnt(text, method = "string", split = " ", n = .x, tolower = FALSE) %>% names() %>% unlist()
) } )) %>%
mutate(ngram_text = map(ngram_text, unlist)) %>%
unnest(cols = c(ngram_text)) %>%
# dif_text = the tweet with the (escaped) ngram removed
mutate(dif_text = str_remove(text, to_search(ngram_text)))
Calculate the Jaccard score between each ngram and sel_text (the target)
# Regression target: Jaccard between each candidate ngram and the true
# selected text.
train_ngrams <- train_ngrams %>%
  mutate(jaccard = map2_dbl(sel_text, ngram_text, jaccard))
Now let's remove further bad rows: tweets where no candidate ngram exactly matches the selected text:
# Drop tweets whose candidate set never contains sel_text exactly — the model
# could never reach jaccard = 1 on them. A grouped summarise is equivalent to
# the original nest()/map_lgl() pattern but much cheaper, and `filter(!...)`
# replaces the `sel != T` anti-pattern (`T` is reassignable).
to_remove <-
  train_ngrams %>%
  group_by(textID) %>%
  summarise(has_exact_match = any(ngram_text == sel_text), .groups = "drop") %>%
  filter(!has_exact_match) %>%
  pull(textID)
train_ngrams <- train_ngrams %>% filter(!textID %in% to_remove)
111 rows are removed
In this step, pipelines will be developed to collect metadata and parse the columns text, ngram_text and dif_text.
Let’s fit the regression model with tabular information about the text.
We developed a function that collects the metadata of each text / ngram / dif text:
# Collect tabular text features for a character vector `x`.
#
# Combines `textfeatures::textfeatures()` output (counts, sentiment scores,
# parts of speech) with additional hand-rolled counts (vowels, consonants,
# negations, mentions, ...). Returns a tibble with one row per element of `x`.
#
# @param x       character vector of texts
# @param verbose print progress and elapsed time?
get_metadata <- function(x, verbose = F){
if(verbose == T){
t0 <- Sys.time() # start time, for the elapsed-minutes report below
cat("Getting metadata, please wait ..\n")
}
# get metadata with `textfeatures`
metadata <- textfeatures::textfeatures(x, normalize = F, word_dims = 0, verbose = verbose)
# discard default n_words and n_uq_words (recomputed below from the raw text)
metadata <- metadata %>% select(-n_words, -n_uq_words)
# more features
# NOTE(review): the n_words / n_uq_words computed in the rowwise mutate below
# are dropped again by transmute() (which keeps only its own columns), so
# they never reach the output — possibly unintended.
metadata <-
tibble(text = x) %>%
rowwise() %>%
mutate(
n_words = length(str_split(text, pattern = " ")[[1]]),
n_uq_words = length(unique(str_split(text, pattern = " ")[[1]]))) %>%
ungroup() %>%
transmute(
n_vogals = str_count(str_to_lower(text), "[aeiou]"),
n_consonants = str_count(str_to_lower(text), "[bcdfghjklmnpqrstvwxyz]"),
n_str = str_length(text),
# n_upper = str_count(text, "[A-Z]"), # n_caps
n_neg = str_count(str_to_lower(text), "(\\bno+\\b|\\bnor+\\b|\\bnot+\\b|n\\'t\\b)"), # negatives
n_atpeople = str_count(text, "@\\w+"),
n_question = str_count(text, "\\?+"),
# n_dot = str_count(text, "\\.+"), # n_period
n_retweet = str_count(text, "(RT|via)((?:\\b\\W*@\\w+)+)")
) %>%
bind_cols(metadata)
# merge the singular and plural person counts into a single column each
metadata <- metadata %>%
mutate(n_first_person = n_first_person + n_first_personp,
n_second_person = n_second_person + n_second_personp) %>%
select(-n_first_personp, -n_second_personp)
if(verbose == T){
cat(paste0("Metadata successfully obtained!\nThe process took: ",
round(difftime(Sys.time(), t0, units = "mins")) ," min\n")) # Yeah!
}
return(metadata)
}
get metadata from train:
# Metadata for the full training tweets, columns prefixed "text_".
text_metadata <-
  get_metadata(train_data$text, verbose = TRUE) %>%
  rename_with(~ paste0("text_", .x)) %>%
  bind_cols(tibble(textID = train_data$textID), .)
FALSE Getting metadata, please wait ..
FALSE [32m↪[39m [38;5;244mCounting features in text...[39m
FALSE [32m↪[39m [38;5;244mSentiment analysis...[39m
FALSE [32m↪[39m [38;5;244mParts of speech...[39m
FALSE [32m↪[39m [38;5;244mWord dimensions started[39m
FALSE [32m✔[39m Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 0 min
# Metadata for every candidate ngram, columns prefixed "sel_text_"
# (one row per ngram, keyed by its tweet's textID).
sel_text_metadata <-
  get_metadata(train_ngrams$ngram_text, verbose = TRUE) %>%
  rename_with(~ paste0("sel_text_", .x)) %>%
  bind_cols(tibble(textID = train_ngrams$textID), .)
FALSE Getting metadata, please wait ..
FALSE [32m↪[39m [38;5;244mCounting features in text...[39m
FALSE [32m↪[39m [38;5;244mSentiment analysis...[39m
FALSE [32m↪[39m [38;5;244mParts of speech...[39m
FALSE [32m↪[39m [38;5;244mWord dimensions started[39m
FALSE [32m✔[39m Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 3 min
# Metadata for the remainder after removing each ngram from its tweet,
# columns prefixed "dif_text_".
dif_text_metadata <-
  get_metadata(train_ngrams$dif_text, verbose = TRUE) %>%
  rename_with(~ paste0("dif_text_", .x)) %>%
  bind_cols(tibble(textID = train_ngrams$textID), .)
FALSE Getting metadata, please wait ..
FALSE [32m↪[39m [38;5;244mCounting features in text...[39m
FALSE [32m↪[39m [38;5;244mSentiment analysis...[39m
FALSE [32m↪[39m [38;5;244mParts of speech...[39m
FALSE [32m↪[39m [38;5;244mWord dimensions started[39m
FALSE [32m✔[39m Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 3 min
# Join all metadata tables into one modeling frame (one row per ngram).
# NOTE(review): the bind_cols() calls assume identical row order between
# sel_text_metadata / dif_text_metadata / train_ngrams (all built from
# train_ngrams, and left_join preserves x row order, so this holds — but it
# is fragile if any table is re-sorted upstream).
train_metadata <-
left_join(
bind_cols(sel_text_metadata, select(dif_text_metadata, -textID)),
bind_cols(train_data, select(text_metadata, -textID)),
by = "textID"
) %>%
bind_cols(select(train_ngrams, ngram_text, dif_text, jaccard, n_words)) %>%
select(textID, text, sel_text, ngram_text, dif_text, sentiment, n_words, jaccard, everything())
# cache to disk so the expensive feature extraction can be skipped on re-runs
saveRDS(train_metadata, "../data/train_metadata.rds")
train_metadata
FALSE # A tibble: 52,911 x 119
FALSE textID text sel_text ngram_text dif_text sentiment n_words jaccard
FALSE <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
FALSE 1 ff31a… Got … I knew … Got " to go… positive 17 0
FALSE 2 ff31a… Got … I knew … I "Got to… positive 17 0.125
FALSE 3 ff31a… Got … I knew … Tesco's. "Got to… positive 17 0
FALSE 4 ff31a… Got … I knew … day "Got to… positive 17 0.125
FALSE 5 ff31a… Got … I knew … down "Got to… positive 17 0
FALSE 6 ff31a… Got … I knew … go "Got to… positive 17 0
FALSE 7 ff31a… Got … I knew … going "Got to… positive 17 0.125
FALSE 8 ff31a… Got … I knew … knew "Got to… positive 17 0.125
FALSE 9 ff31a… Got … I knew … my "Got to… positive 17 0.125
FALSE 10 ff31a… Got … I knew … shopping "Got to… positive 17 0
FALSE # … with 52,901 more rows, and 111 more variables: sel_text_n_vogals <int>,
FALSE # sel_text_n_consonants <int>, sel_text_n_str <int>, sel_text_n_neg <int>,
FALSE # sel_text_n_atpeople <int>, sel_text_n_question <int>,
FALSE # sel_text_n_retweet <int>, sel_text_n_urls <int>, sel_text_n_uq_urls <int>,
FALSE # sel_text_n_hashtags <int>, sel_text_n_uq_hashtags <int>,
FALSE # sel_text_n_mentions <int>, sel_text_n_uq_mentions <int>,
FALSE # sel_text_n_chars <int>, sel_text_n_uq_chars <int>, sel_text_n_commas <int>,
FALSE # sel_text_n_digits <int>, sel_text_n_exclaims <int>,
FALSE # sel_text_n_extraspaces <int>, sel_text_n_lowers <int>,
FALSE # sel_text_n_lowersp <dbl>, sel_text_n_periods <int>, sel_text_n_caps <int>,
FALSE # sel_text_n_nonasciis <int>, sel_text_n_puncts <int>,
FALSE # sel_text_n_capsp <dbl>, sel_text_n_charsperword <dbl>,
FALSE # sel_text_sent_afinn <dbl>, sel_text_sent_bing <dbl>,
FALSE # sel_text_sent_syuzhet <dbl>, sel_text_sent_vader <dbl>,
FALSE # sel_text_n_polite <dbl>, sel_text_n_first_person <int>,
FALSE # sel_text_n_second_person <int>, sel_text_n_third_person <int>,
FALSE # sel_text_n_tobe <int>, sel_text_n_prepositions <int>,
FALSE # dif_text_n_vogals <int>, dif_text_n_consonants <int>, dif_text_n_str <int>,
FALSE # dif_text_n_neg <int>, dif_text_n_atpeople <int>, dif_text_n_question <int>,
FALSE # dif_text_n_retweet <int>, dif_text_n_urls <int>, dif_text_n_uq_urls <int>,
FALSE # dif_text_n_hashtags <int>, dif_text_n_uq_hashtags <int>,
FALSE # dif_text_n_mentions <int>, dif_text_n_uq_mentions <int>,
FALSE # dif_text_n_chars <int>, dif_text_n_uq_chars <int>, dif_text_n_commas <int>,
FALSE # dif_text_n_digits <int>, dif_text_n_exclaims <int>,
FALSE # dif_text_n_extraspaces <int>, dif_text_n_lowers <int>,
FALSE # dif_text_n_lowersp <dbl>, dif_text_n_periods <int>, dif_text_n_caps <int>,
FALSE # dif_text_n_nonasciis <int>, dif_text_n_puncts <int>,
FALSE # dif_text_n_capsp <dbl>, dif_text_n_charsperword <dbl>,
FALSE # dif_text_sent_afinn <dbl>, dif_text_sent_bing <dbl>,
FALSE # dif_text_sent_syuzhet <dbl>, dif_text_sent_vader <dbl>,
FALSE # dif_text_n_polite <dbl>, dif_text_n_first_person <int>,
FALSE # dif_text_n_second_person <int>, dif_text_n_third_person <int>,
FALSE # dif_text_n_tobe <int>, dif_text_n_prepositions <int>, text_n_vogals <int>,
FALSE # text_n_consonants <int>, text_n_str <int>, text_n_neg <int>,
FALSE # text_n_atpeople <int>, text_n_question <int>, text_n_retweet <int>,
FALSE # text_n_urls <int>, text_n_uq_urls <int>, text_n_hashtags <int>,
FALSE # text_n_uq_hashtags <int>, text_n_mentions <int>, text_n_uq_mentions <int>,
FALSE # text_n_chars <int>, text_n_uq_chars <int>, text_n_commas <int>,
FALSE # text_n_digits <int>, text_n_exclaims <int>, text_n_extraspaces <int>,
FALSE # text_n_lowers <int>, text_n_lowersp <dbl>, text_n_periods <int>,
FALSE # text_n_caps <int>, text_n_nonasciis <int>, text_n_puncts <int>,
FALSE # text_n_capsp <dbl>, …
get metadata from test:
# Metadata for the full test tweets, columns prefixed "text_".
text_metadata <-
  get_metadata(test_data$text, verbose = TRUE) %>%
  rename_with(~ paste0("text_", .x)) %>%
  bind_cols(tibble(textID = test_data$textID), .)
FALSE Getting metadata, please wait ..
FALSE [32m↪[39m [38;5;244mCounting features in text...[39m
FALSE [32m↪[39m [38;5;244mSentiment analysis...[39m
FALSE [32m↪[39m [38;5;244mParts of speech...[39m
FALSE [32m↪[39m [38;5;244mWord dimensions started[39m
FALSE [32m✔[39m Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 0 min
# Metadata for every test candidate ngram, columns prefixed "sel_text_".
sel_text_metadata <-
  get_metadata(test_ngrams$ngram_text, verbose = TRUE) %>%
  rename_with(~ paste0("sel_text_", .x)) %>%
  bind_cols(tibble(textID = test_ngrams$textID), .)
FALSE Getting metadata, please wait ..
FALSE [32m↪[39m [38;5;244mCounting features in text...[39m
FALSE [32m↪[39m [38;5;244mSentiment analysis...[39m
FALSE [32m↪[39m [38;5;244mParts of speech...[39m
FALSE [32m↪[39m [38;5;244mWord dimensions started[39m
FALSE [32m✔[39m Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 14 min
# Metadata for the remainder after removing each test ngram from its tweet,
# columns prefixed "dif_text_".
dif_text_metadata <-
  get_metadata(test_ngrams$dif_text, verbose = TRUE) %>%
  rename_with(~ paste0("dif_text_", .x)) %>%
  bind_cols(tibble(textID = test_ngrams$textID), .)
FALSE Getting metadata, please wait ..
FALSE [32m↪[39m [38;5;244mCounting features in text...[39m
FALSE [32m↪[39m [38;5;244mSentiment analysis...[39m
FALSE [32m↪[39m [38;5;244mParts of speech...[39m
FALSE [32m↪[39m [38;5;244mWord dimensions started[39m
FALSE [32m✔[39m Job's done!
FALSE Metadata successfully obtained!
FALSE The process took: 15 min
# Join all test metadata tables (one row per candidate ngram).
# NOTE(review): like the train join, bind_cols() relies on row order being
# aligned with test_ngrams.
test_metadata <-
left_join(
bind_cols(sel_text_metadata, select(dif_text_metadata, -textID)),
bind_cols(test_data, select(text_metadata, -textID)),
by = "textID"
) %>%
bind_cols(select(test_ngrams, ngram_text, dif_text, n_words)) %>%
select(textID, text, ngram_text, dif_text, sentiment, n_words, everything())
# cache to disk
saveRDS(test_metadata, "../data/test_metadata.rds")
test_metadata
FALSE # A tibble: 237,605 x 117
FALSE textID text ngram_text dif_text sentiment n_words sel_text_n_voga…
FALSE <chr> <chr> <chr> <chr> <chr> <dbl> <int>
FALSE 1 11aa4… http… - "http:/… positive 13 0
FALSE 2 11aa4… http… Malta "http:/… positive 13 2
FALSE 3 11aa4… http… but "http:/… positive 13 1
FALSE 4 11aa4… http… calling "http:/… positive 13 2
FALSE 5 11aa4… http… can't "http:/… positive 13 1
FALSE 6 11aa4… http… from "http:/… positive 13 1
FALSE 7 11aa4… http… http://tw… " - i w… positive 13 3
FALSE 8 11aa4… http… i "http:/… positive 13 1
FALSE 9 11aa4… http… was "http:/… positive 13 1
FALSE 10 11aa4… http… wish "http:/… positive 13 1
FALSE # … with 237,595 more rows, and 110 more variables:
FALSE # sel_text_n_consonants <int>, sel_text_n_str <int>, sel_text_n_neg <int>,
FALSE # sel_text_n_atpeople <int>, sel_text_n_question <int>,
FALSE # sel_text_n_retweet <int>, sel_text_n_urls <int>, sel_text_n_uq_urls <int>,
FALSE # sel_text_n_hashtags <int>, sel_text_n_uq_hashtags <int>,
FALSE # sel_text_n_mentions <int>, sel_text_n_uq_mentions <int>,
FALSE # sel_text_n_chars <int>, sel_text_n_uq_chars <int>, sel_text_n_commas <int>,
FALSE # sel_text_n_digits <int>, sel_text_n_exclaims <int>,
FALSE # sel_text_n_extraspaces <int>, sel_text_n_lowers <int>,
FALSE # sel_text_n_lowersp <dbl>, sel_text_n_periods <int>, sel_text_n_caps <int>,
FALSE # sel_text_n_nonasciis <int>, sel_text_n_puncts <int>,
FALSE # sel_text_n_capsp <dbl>, sel_text_n_charsperword <dbl>,
FALSE # sel_text_sent_afinn <dbl>, sel_text_sent_bing <dbl>,
FALSE # sel_text_sent_syuzhet <dbl>, sel_text_sent_vader <dbl>,
FALSE # sel_text_n_polite <dbl>, sel_text_n_first_person <int>,
FALSE # sel_text_n_second_person <int>, sel_text_n_third_person <int>,
FALSE # sel_text_n_tobe <int>, sel_text_n_prepositions <int>,
FALSE # dif_text_n_vogals <int>, dif_text_n_consonants <int>, dif_text_n_str <int>,
FALSE # dif_text_n_neg <int>, dif_text_n_atpeople <int>, dif_text_n_question <int>,
FALSE # dif_text_n_retweet <int>, dif_text_n_urls <int>, dif_text_n_uq_urls <int>,
FALSE # dif_text_n_hashtags <int>, dif_text_n_uq_hashtags <int>,
FALSE # dif_text_n_mentions <int>, dif_text_n_uq_mentions <int>,
FALSE # dif_text_n_chars <int>, dif_text_n_uq_chars <int>, dif_text_n_commas <int>,
FALSE # dif_text_n_digits <int>, dif_text_n_exclaims <int>,
FALSE # dif_text_n_extraspaces <int>, dif_text_n_lowers <int>,
FALSE # dif_text_n_lowersp <dbl>, dif_text_n_periods <int>, dif_text_n_caps <int>,
FALSE # dif_text_n_nonasciis <int>, dif_text_n_puncts <int>,
FALSE # dif_text_n_capsp <dbl>, dif_text_n_charsperword <dbl>,
FALSE # dif_text_sent_afinn <dbl>, dif_text_sent_bing <dbl>,
FALSE # dif_text_sent_syuzhet <dbl>, dif_text_sent_vader <dbl>,
FALSE # dif_text_n_polite <dbl>, dif_text_n_first_person <int>,
FALSE # dif_text_n_second_person <int>, dif_text_n_third_person <int>,
FALSE # dif_text_n_tobe <int>, dif_text_n_prepositions <int>, text_n_vogals <int>,
FALSE # text_n_consonants <int>, text_n_str <int>, text_n_neg <int>,
FALSE # text_n_atpeople <int>, text_n_question <int>, text_n_retweet <int>,
FALSE # text_n_urls <int>, text_n_uq_urls <int>, text_n_hashtags <int>,
FALSE # text_n_uq_hashtags <int>, text_n_mentions <int>, text_n_uq_mentions <int>,
FALSE # text_n_chars <int>, text_n_uq_chars <int>, text_n_commas <int>,
FALSE # text_n_digits <int>, text_n_exclaims <int>, text_n_extraspaces <int>,
FALSE # text_n_lowers <int>, text_n_lowersp <dbl>, text_n_periods <int>,
FALSE # text_n_caps <int>, text_n_nonasciis <int>, text_n_puncts <int>,
FALSE # text_n_capsp <dbl>, text_n_charsperword <dbl>, …
Function to calculate statistics for each ngram relative to the full text, and for the remainder of the text (after the ngram is removed) relative to the full text
# Derive interaction features between each ngram (sel_text), the remainder
# (dif_text) and the whole tweet (text): sentiment differences (sd_*),
# count ratios (prop_*, guarded against division by zero) and yes/no flags
# (cat_*).
#
# @param metadata train_metadata or test_metadata frame
# @param test     TRUE for test data (no sel_text / jaccard columns)
# @return the input with the derived columns appended and reordered
parse_metadata <- function(metadata, test = F){
metadata <-
metadata %>%
mutate(
# text stats
text_n_words = n_words,
# text_n_lowersp,
# text_n_capsp,
# text_n_charsperword,
# sel_text stats
sel_text_n_words = map_dbl(ngram_text, ~length(str_split(.x, pattern = " ")[[1]])),
# sel_text_n_lowersp,
# sel_text_n_capsp,
# sel_text_n_charsperword,
# interaction sel_text x text
sd_sel_text_sent_afinn = text_sent_afinn - sel_text_sent_afinn,
sd_sel_text_sent_bing = text_sent_bing - sel_text_sent_bing,
sd_sel_text_sent_syuzhet = text_sent_syuzhet - sel_text_sent_syuzhet,
sd_sel_text_sent_vader = text_sent_vader - sel_text_sent_vader,
sd_sel_text_n_polite = text_n_polite - sel_text_n_polite,
prop_sel_text_n_vogals = if_else(text_n_vogals == 0, 0, sel_text_n_vogals / text_n_vogals),
prop_sel_text_n_consonants = if_else(text_n_consonants == 0, 0, sel_text_n_consonants / text_n_consonants),
prop_sel_text_n_str = if_else(text_n_str == 0, 0, sel_text_n_str / text_n_str),
# NOTE(review): this ratio is text/sel while prop_dif_text_len below is
# dif/text — inverted directions; possibly intentional, worth confirming.
prop_sel_text_len = text_n_words / sel_text_n_words,
prop_sel_text_n_chars = if_else(text_n_chars == 0, 0, sel_text_n_chars / text_n_chars),
prop_sel_text_n_uq_chars = if_else(text_n_uq_chars == 0, 0, sel_text_n_uq_chars / text_n_uq_chars),
prop_sel_text_n_lowers = if_else(text_n_lowers == 0, 0, sel_text_n_lowers / text_n_lowers),
prop_sel_text_n_caps = if_else(text_n_caps == 0, 0, sel_text_n_caps / text_n_caps),
prop_sel_text_n_periods = if_else(text_n_periods == 0, 0, sel_text_n_periods / text_n_periods),
prop_sel_text_n_commas = if_else(text_n_commas == 0, 0, sel_text_n_commas / text_n_commas),
prop_sel_text_n_exclaims = if_else(text_n_exclaims == 0, 0, sel_text_n_exclaims / text_n_exclaims),
prop_sel_text_n_puncts = if_else(text_n_puncts == 0, 0, sel_text_n_puncts / text_n_puncts),
prop_sel_text_n_prepositions = if_else(text_n_prepositions == 0, 0, sel_text_n_prepositions / text_n_prepositions),
cat_sel_text_n_neg = if_else(sel_text_n_neg == 0, "no", "yes"),
cat_sel_text_n_question = if_else(sel_text_n_question == 0, "no", "yes"),
cat_sel_text_n_digits = if_else(sel_text_n_digits == 0, "no", "yes"),
cat_sel_text_n_extraspaces = if_else(sel_text_n_extraspaces == 0, "no", "yes"),
cat_sel_text_n_tobe = if_else(sel_text_n_tobe == 0, "no", "yes"),
cat_sel_text_n_first_person = if_else(sel_text_n_first_person == 0, "no", "yes"),
cat_sel_text_n_second_person = if_else(sel_text_n_second_person == 0, "no", "yes"),
cat_sel_text_n_third_person = if_else(sel_text_n_third_person == 0, "no", "yes"),
# dif_text stats
dif_text_n_words = map_dbl(dif_text, ~length(str_split(.x, pattern = " ")[[1]])),
# dif_text_n_lowersp,
# dif_text_n_capsp,
# dif_text_n_charsperword,
# interaction dif_text x text
sd_dif_text_sent_afinn = text_sent_afinn - dif_text_sent_afinn,
sd_dif_text_sent_bing = text_sent_bing - dif_text_sent_bing,
sd_dif_text_sent_syuzhet = text_sent_syuzhet - dif_text_sent_syuzhet,
sd_dif_text_sent_vader = text_sent_vader - dif_text_sent_vader,
sd_dif_text_n_polite = text_n_polite - dif_text_n_polite,
prop_dif_text_n_vogals = if_else(text_n_vogals == 0, 0, dif_text_n_vogals / text_n_vogals),
prop_dif_text_n_consonants = if_else(text_n_consonants == 0, 0, dif_text_n_consonants / text_n_consonants),
prop_dif_text_n_str = if_else(text_n_str == 0, 0, dif_text_n_str / text_n_str),
prop_dif_text_len = dif_text_n_words / text_n_words,
prop_dif_text_n_chars = if_else(text_n_chars == 0, 0, dif_text_n_chars / text_n_chars),
prop_dif_text_n_uq_chars = if_else(text_n_uq_chars == 0, 0, dif_text_n_uq_chars / text_n_uq_chars),
prop_dif_text_n_lowers = if_else(text_n_lowers == 0, 0, dif_text_n_lowers / text_n_lowers),
prop_dif_text_n_caps = if_else(text_n_caps == 0, 0, dif_text_n_caps / text_n_caps),
prop_dif_text_n_periods = if_else(text_n_periods == 0, 0, dif_text_n_periods / text_n_periods),
prop_dif_text_n_commas = if_else(text_n_commas == 0, 0, dif_text_n_commas / text_n_commas),
prop_dif_text_n_exclaims = if_else(text_n_exclaims == 0, 0, dif_text_n_exclaims / text_n_exclaims),
prop_dif_text_n_puncts = if_else(text_n_puncts == 0, 0, dif_text_n_puncts / text_n_puncts),
prop_dif_text_n_prepositions = if_else(text_n_prepositions == 0, 0, dif_text_n_prepositions / text_n_prepositions),
cat_dif_text_n_neg = if_else(dif_text_n_neg == 0, "no", "yes"),
cat_dif_text_n_question = if_else(dif_text_n_question == 0, "no", "yes"),
cat_dif_text_n_digits = if_else(dif_text_n_digits == 0, "no", "yes"),
cat_dif_text_n_extraspaces = if_else(dif_text_n_extraspaces == 0, "no", "yes"),
cat_dif_text_n_tobe = if_else(dif_text_n_tobe == 0, "no", "yes"),
cat_dif_text_n_first_person = if_else(dif_text_n_first_person == 0, "no", "yes"),
cat_dif_text_n_second_person = if_else(dif_text_n_second_person == 0, "no", "yes"),
cat_dif_text_n_third_person = if_else(dif_text_n_third_person == 0, "no", "yes"),
)
# train keeps sel_text/jaccard columns; test has neither
if(test == F){
metadata %>%
select(textID, text, sel_text, ngram_text, dif_text, sentiment, jaccard,everything())
}else{
metadata %>%
select(textID, text, ngram_text, dif_text, sentiment,everything())
}
}
Apply function in train and test data:
# Build the modeling frames. sel_text is dropped from train so the model
# cannot see the answer. (The original also ran `ttrain$sel_text <- NULL`
# afterwards — a no-op once select(-sel_text) has run, removed here.)
ttrain <- parse_metadata(train_metadata, test = FALSE) %>% select(-sel_text)
ttest <- parse_metadata(test_metadata, test = TRUE)
Remove bad textIDs again (without jaccard = 1):
# Keep only tweets where at least one candidate reaches jaccard == 1.
# A grouped summarise replaces the slower nest()/map_lgl() pattern, and
# `filter(!...)` replaces the `sel != T` anti-pattern (`T` is reassignable).
to_remove <-
  ttrain %>%
  group_by(textID) %>%
  summarise(has_perfect = any(jaccard == 1), .groups = "drop") %>%
  filter(!has_perfect) %>%
  pull(textID)
ttrain <- ttrain %>% filter(!textID %in% to_remove)
Sample ngrams of each text that have the same jaccard
# Density of the target before sampling, per sentiment.
g1 <-
  ttrain %>%
  ggplot(aes(x = jaccard, fill = sentiment)) +
  geom_density(alpha = .5) +
  labs(title = "before random sample")
# Share of zero vs non-zero jaccard candidates.
# `TRUE` (not the reassignable `T`) as the case_when catch-all; note the
# catch-all makes the following is.na() filter a no-op, kept for safety.
ttrain %>%
  mutate(jaccard = case_when(jaccard == 0 ~ 0,
                             TRUE ~ 1)) %>%
  filter(!is.na(jaccard)) %>%
  count(jaccard) %>% mutate(prop = n / sum(n))
FALSE # A tibble: 2 x 3
FALSE jaccard n prop
FALSE <dbl> <int> <dbl>
FALSE 1 0 25853 0.517
FALSE 2 1 24167 0.483
There are many zeros, let’s sample ngrams:
# Down-sample: keep one candidate per (tweet, jaccard rounded to 2 dp),
# which removes most of the duplicated zero-jaccard ngrams.
set.seed(1)
ttrain <-
  ttrain %>%
  group_by(textID) %>% nest() %>% ungroup() %>%
  mutate(data = map(data, function(per_tweet) {
    per_tweet %>%
      mutate(rounded_jaccard = round(jaccard, 2)) %>%
      group_by(rounded_jaccard) %>%
      sample_n(1) %>%
      ungroup()
  })) %>%
  unnest(cols = c(data)) %>%
  select(-rounded_jaccard)
# Density of the target after sampling, then stack both plots (patchwork).
g2 <-
  ggplot(ttrain, aes(x = jaccard, fill = sentiment)) +
  geom_density(alpha = .5) +
  labs(title = "after random sample")
g1 / g2
For machine learning, we will use the tidymodels framework:
Pre processing with recipes:
# Preprocessing recipe: drop identifier/raw-text columns, encode sentiment
# numerically (positive = 1, negative = -1; any other value would become NA —
# safe here because neutral rows were removed earlier), Yeo-Johnson transform
# and normalize numeric predictors, dummy-encode the "yes"/"no" columns.
jaccard_recipe <- recipe(ttrain, jaccard ~ .) %>%
step_rm(textID, text, ngram_text, dif_text) %>%
step_mutate(sentiment = case_when(sentiment == "positive"~1,
sentiment == "negative"~-1)) %>%
step_YeoJohnson(all_numeric(),-all_outcomes(), -sentiment) %>%
step_normalize(all_numeric(),-all_outcomes()) %>%
step_dummy(all_nominal())
Define cross validation with recipes:
# 5-fold cross-validation, stratified on the (binned) continuous outcome.
set.seed(123)
jaccard_vfold <- vfold_cv(ttrain, v = 5, strata = jaccard)
Define model with parsnip:
# xgboost regression spec with every major hyper-parameter marked for tuning.
jaccard_xgb_model <-
boost_tree(
trees = tune(),
learn_rate = tune(), # step size
tree_depth = tune(), min_n = tune(),
loss_reduction = tune(), # first three: model complexity
sample_size = tune(), mtry = tune(), # randomness
) %>%
set_mode("regression") %>%
set_engine("xgboost", nthread = ncores)
Start xgboost workflow:
# Assemble recipe + model into workflows.
# NOTE(review): jaccard_xgb_workflow is never used afterwards — the tuning
# below rebuilds an identical workflow as workflow_jaccard_xgb_model.
jaccard_workflow <- workflow() %>% add_recipe(jaccard_recipe)
jaccard_xgb_workflow <-jaccard_workflow %>% add_model(jaccard_xgb_model)
Determine params
# Search space for Bayesian tuning; mtry's upper bound is finalized from the
# number of predictors in ttrain, and trees is capped to 100-500.
xgb_params <- parameters(
trees(),
learn_rate(), # step size
tree_depth(), min_n(),
loss_reduction(), # first three: model complexity
sample_size = sample_prop(), finalize(mtry(), ttrain) # randomness
)
xgb_params <- xgb_params %>% update(trees = trees(c(100, 500)))
Update the xgboost workflow:
# Workflow used for tuning (duplicates jaccard_xgb_workflow defined above).
workflow_jaccard_xgb_model <-
workflow() %>%
add_model(jaccard_xgb_model) %>%
add_recipe(jaccard_recipe)
Iterative Bayesian optimization of a regression model:
# Bayesian hyper-parameter search: up to 30 iterations, stopping after 10
# without improvement; per-fold predictions are saved for the plots below.
set.seed(321)
xgb_tune <-
workflow_jaccard_xgb_model %>%
tune_bayes(
resamples = jaccard_vfold,
param_info = xgb_params,
# initial = ?,
iter = 30,
# metrics = metric_set(rmse, mape),
control = control_bayes(no_improve = 10,
save_pred = T, verbose = T)
)
autoplot(xgb_tune)
Collect predictions:
# Compare predicted (.pred) vs observed (jaccard) densities per resample.
# BUG FIX: `volor = key` was a typo for `colour = key` (the aesthetic was
# silently ignored); gather() is superseded, replaced by pivot_longer().
collect_predictions(xgb_tune) %>%
  select(id, .pred, jaccard) %>%
  pivot_longer(-id, names_to = "key", values_to = "value") %>%
  ggplot(aes(x = value, colour = key, fill = key)) +
  geom_density(alpha = .2) +
  labs(x = "", y = "") +
  facet_wrap(~id) +
  theme(legend.position = "bottom")
Select best model:
# Pick the hyper-parameter set with the lowest cross-validated RMSE
# (FALSE, not the reassignable shorthand F).
jaccard_best_model <- select_best(xgb_tune, "rmse", maximize = FALSE)
print(jaccard_best_model)
FALSE # A tibble: 1 x 7
FALSE mtry trees min_n tree_depth learn_rate loss_reduction sample_size
FALSE <int> <int> <int> <int> <dbl> <dbl> <dbl>
FALSE 1 30 124 15 14 0.0981 0.000000205 0.861
Fit final model:
# Plug the winning hyper-parameters into the spec and refit on all of ttrain.
jaccard_final_model <- finalize_model(jaccard_xgb_model, jaccard_best_model)
jaccard_workflow <- update_model(workflow_jaccard_xgb_model, jaccard_final_model)
jaccard_xgb_fit <- fit(jaccard_workflow, data = ttrain)
Predict jaccard for all test ngrams:
# Score every candidate ngram, then keep the highest-scoring one per tweet.
# NOTE(review): top_n() keeps ties and does not sort; distinct() then keeps
# the first tied row in original order, so the winning ngram among exact-tied
# predictions depends on row order.
pred <- predict(jaccard_xgb_fit, ttest)
results <-
ttest %>%
bind_cols(as_tibble(pred)) %>%
select(textID, text, ngram_text, .pred) %>%
group_by(textID) %>%
top_n(1, .pred) %>%
distinct(textID, .pred, .keep_all = T) %>%
ungroup()
head(results)
FALSE # A tibble: 6 x 4
FALSE textID text ngram_text .pred
FALSE <chr> <chr> <chr> <dbl>
FALSE 1 11aa4945… http://twitpic.com/67swx - i wish i was calling yo… wish 0.850
FALSE 2 fd1db57d… i'm done.haha. HOUSE MD marathon ulet marathon 0.539
FALSE 3 2524332d… I'm concerned for that family concerned 0.736
FALSE 4 0fb19285… HEY GUYS IT'S WORKING NO NEED TO WORRY. i have too… WORRY. 1.06
FALSE 5 311d2b18… Tracy and Berwick breaks my achy breaky heart The… breaks 0.675
FALSE 6 95dfefd4… Well off 2 bed...cant wait 2 party 4 Mother's Day … like 0.750
Prepare to submit!
# Build the submission: model picks for positive/negative tweets, the full
# text for neutral tweets (held out earlier).
submission <- read_csv("../data/sample_submission.csv")
submission <-
  submission %>%
  select(-selected_text) %>%
  left_join(
    bind_rows(
      select(results, textID, selected_text = ngram_text),
      select(test_neutral, textID, selected_text = text)
    ),
    by = "textID" # explicit join key (was implicit, emitting a message)
  )
https://www.kaggle.com/nkoprowicz/a-simple-solution-using-only-word-counts https://www.kaggle.com/khoongweihao/feature-engineering-lightgbm-model-starter-kit https://www.kaggle.com/c/tweet-sentiment-extraction/discussion/139803 https://www.kaggle.com/jonathanbesomi/question-answering-starter-pack https://machinelearningmastery.com/gentle-introduction-text-summarization/ https://www.tidymodels.org/learn/work/bayes-opt/